1 Overview

This R Markdown document reproduces the main analyses and figures for tumor evolution in high-clonality LUAD (Lung Adenocarcinoma) dataset from the Sherlock-Lung study

manuscript title: Deciphering lung adenocarcinoma evolution and the role of LINE-1 retrotransposition

Fig. 3: Characterization of tumors with mutational signature ID2.


2 Load Required Libraries and Set Plotting Styles

(You may need to install some packages if missing)

library(tidyverse)
library(ggplot2)
library(ggsankey)
library(scales)
library(hrbrthemes)   # For theme_ipsum_rc
library(cowplot)
library(forcats)
library(ggrepel)
library(ggnewscale)
library(data.table)
library(ggasym)
library(ggpmisc)
library(broom)
library(ggsci)
library(conflicted)

conflicts_prefer(dplyr::filter)
conflicts_prefer(dplyr::rename)
pdfhr2()

# --- Define Helper Functions (if any custom) -----------------------------------
# Please source your custom functions here if needed, or place them in ./functions/
# source('./functions/your_custom_functions.R')

# --- Load Input Datasets ------------------------------------------------------
# All data files should be placed in the appropriate subfolders.
# Example folder structure:
#   ./data/           - for main input files
#   ./functions/      - for custom R functions
#   ./output/         - for saving plots and results

# Replace the filenames below with your actual dataset filenames.
load('./data/BBsolution_final3_short.RData',verbose = T)
## Loading objects:
##   BBsolution
##   BBsolution4
##   hq_samples
##   wgs_groups_info
##   wgs_groups_info2
load('./data/covdata0.RData',verbose = T)
## Loading objects:
##   covdata0
load('./data/clinical_data.RData',verbose = T)
## Loading objects:
##   clinical_data
load('./data/sherlock_data_all.RData',verbose = T)
## Loading objects:
##   sherlock_data
##   sherlock_data_full
##   sherlock_overall
##   sherlock_type_colors
##   sherlock_freq
##   sherlock_freq_hq
load('./data/sherlock_variable.RData',verbose = T)
## Loading objects:
##   sherlock_variable
load('./data/sp_group_data.RData',verbose = T)
## Loading objects:
##   sp_group_color
##   sp_group_color_new
##   sp_group_data
##   sp_group_data2
load('./data/id2data.RData',verbose = T)
## Loading objects:
##   id2data
##   id2color
load('./data/tedata.RData',verbose = T)
## Loading objects:
##   tedata
load('./data/suvdata.RData',verbose = T)
## Loading objects:
##   suvdata
load('./data/RNASeq_Exp.RData',verbose = T)
## Loading objects:
##   cdata3
##   rdata1
##   adata
load('./data/Signature_Lumidl_CN.RData',verbose = T)
## Loading objects:
##   cn68_activity
##   cn68_denovo_activity_nonsmoker
##   cn68_denovo_activity_smoker
##   cn68_signature
##   cn68_decompsite
##   cn68_signature_denovo_nonsmoker
##   cn68_signature_denovo_smoker
##   cn68_decompsite_denovo_nonsmoker
##   cn68_decompsite_denovo_smoker
##   cn68_denovo_mapping_nonsmoker
##   cn68_denovo_mapping_detail_nonsmoker
load('./data/Signature_Lumidl_SV.RData',verbose = T)
## Loading objects:
##   sv38_activity
##   sv38_denovo_activity_nonsmoker
##   sv38_denovo_activity_smoker
##   sv38_signature
##   sv38_decompsite
##   sv38_signature_denovo_nonsmoker
##   sv38_signature_denovo_smoker
##   sv38_decompsite_denovo_nonsmoker
##   sv38_decompsite_denovo_smoker
##   sv38_denovo_mapping_nonsmoker
##   sv38_denovo_mapping_detail_nonsmoker
load('./data/sherlock_profiles.RData',verbose = T)
## Loading objects:
##   LCINS
##   sherlock_dbs78_profile
##   sherlock_sbs96_profile
##   sherlock_id83_profile
##   sherlock_sbs288_profile
##   sherlock_cn68_profile
##   sherlock_cn48_profile
##   sherlock_sv38_profile
##   sherlock_sv32_profile
# load function -----------------------------------------------------------
load('./data/ZTW_functions.RData',verbose = T)
## Loading objects:
##   barcode2spg
##   sp_group_color
##   sp_group_color_new
##   sp_group_lift
##   sp_group_data
##   sp_group_major_new
##   sp_group_major
#source('./data/ZTW_functions.R')
source('./functions/Sherlock_functions.R')

# load analysis related data set
load('./data/Chronological_timing_short.RData',verbose = T)
## Loading objects:
##   MRCAdata
##   subclonesTimeAbs
##   subclonesTimeAbsLinear
##   mrate
##   rateDeam
##   timingclass_groups
##   timingInfo
tmp <- colnames(MRCAdata)
MRCAdata <- MRCAdata %>% select(-SP_Group) %>% left_join(sp_group_data2) %>% select(-SP_Group_New) %>% select(one_of(tmp))

## analysis limited to luad only
hq_samples2 <- covdata0 %>% filter(Histology == 'Adenocarcinoma', Tumor_Barcode %in% hq_samples) %>% pull(Tumor_Barcode)
rm(list=c('hq_samples'))

3 Fig. 3a-b: Association between Cell Proliferation Marker Gene Expression and Mutational Signature ID2

Fig. 3a: Relationships between mutational signature ID2 presence and gene expression of tumor proliferation markers, analyzed from tumor and normal tissue RNA-Seq data. The horizontal line represents the significance threshold (FDR<0.05).

Fig. 3b: Pearson correlation between the number of deletions attributed to mutational signature ID2 and the gene expression of tumor proliferation markers. Pearson correlation coefficients and corresponding p-values are displayed above the plots.

# ID2 with all proliferation markers
genelist <- c('MKI67','TOP2A','MYBL2','BUB1','PLK1','CCNE1','CCNB1','BUB1','FOXM1')
tdata <- rdata1 %>% filter(Gene %in% genelist)

tdata <- id2data %>% 
  filter(Tumor_Barcode %in% hq_samples2) %>% 
  left_join(tdata) %>% 
  filter(!is.na(Exp))


fcdata <- tdata %>% 
  group_by(RNAseq_Type,Gene,ID2_Present) %>% 
  summarise(mvalue=median(Exp)) %>% 
  ungroup() %>% 
  pivot_wider(names_from = ID2_Present,values_from = mvalue) %>% 
  mutate(FC=Present/Absent)


tdata %>% 
  group_by(RNAseq_Type,Gene) %>% 
  do(tidy(wilcox.test(Exp~ID2_Present, data=.))) %>% 
  left_join(fcdata) %>% 
  group_by(RNAseq_Type) %>% 
  mutate(FDR=p.adjust(p.value)) %>% 
  ggplot(aes(log2(FC),-log10(FDR),fill=RNAseq_Type))+
  geom_hline(yintercept = -log10(0.05),linetype=2,col='red',size=0.5)+
  # geom_vline(xintercept = c(-5,5),linetype=2,col='blue',size=0.5)+
  geom_point(pch=21,stroke=0.4,size=3)+
  scale_fill_nejm()+
  scale_x_continuous(breaks = pretty_breaks(n=6))+
  scale_y_continuous(breaks = pretty_breaks(n = 6))+
  ggrepel::geom_text_repel(aes(label=Gene),size=3,max.overlaps = 20,force = 20)+
  labs(x='log2(Fold change)',y='-log10(FDR)',fill='RNA-Seq data')+
  guides(fill = guide_legend(override.aes = list(size=3.5)))+
  theme_ipsum_rc(base_size = 12,axis_title_just = 'm',axis_title_size = 14,grid='XY',ticks = T)+
  theme(legend.position = 'top')+
  panel_border(color = 'black',size = 0.5)+
  coord_cartesian(clip = 'off')

#ggsave(filename = './output/ID2_Proliferation_Markers.pdf',width = 5,height = 5,device = cairo_pdf())


tdata %>% 
  left_join(covdata0) %>% 
  group_by(Gene,RNAseq_Type) %>% 
  do(tidy(lm(Exp~ID2_Present + Assigned_Population + Gender + Smoking + Tumor_Purity + Age, data=.))) %>% 
  filter(term=='ID2_PresentPresent') %>% 
  arrange((p.value))
## # A tibble: 16 × 7
## # Groups:   Gene, RNAseq_Type [16]
##    Gene  RNAseq_Type term               estimate std.error statistic  p.value
##    <chr> <chr>       <chr>                 <dbl>     <dbl>     <dbl>    <dbl>
##  1 FOXM1 Tumor       ID2_PresentPresent   1.39       0.164    8.44   7.89e-16
##  2 PLK1  Tumor       ID2_PresentPresent   1.22       0.147    8.29   2.34e-15
##  3 MYBL2 Tumor       ID2_PresentPresent   1.72       0.212    8.13   6.90e-15
##  4 MKI67 Tumor       ID2_PresentPresent   1.23       0.158    7.83   5.70e-14
##  5 TOP2A Tumor       ID2_PresentPresent   1.37       0.184    7.44   7.70e-13
##  6 BUB1  Tumor       ID2_PresentPresent   1.29       0.175    7.34   1.49e-12
##  7 CCNB1 Tumor       ID2_PresentPresent   1.20       0.165    7.27   2.22e-12
##  8 CCNE1 Tumor       ID2_PresentPresent   1.07       0.200    5.37   1.41e- 7
##  9 CCNB1 Normal      ID2_PresentPresent   0.118      0.131    0.901  3.68e- 1
## 10 BUB1  Normal      ID2_PresentPresent   0.123      0.144    0.853  3.95e- 1
## 11 MKI67 Normal      ID2_PresentPresent   0.110      0.151    0.732  4.65e- 1
## 12 TOP2A Normal      ID2_PresentPresent   0.124      0.170    0.726  4.69e- 1
## 13 FOXM1 Normal      ID2_PresentPresent   0.0658     0.125    0.526  5.99e- 1
## 14 PLK1  Normal      ID2_PresentPresent   0.0370     0.114    0.324  7.46e- 1
## 15 MYBL2 Normal      ID2_PresentPresent   0.0552     0.180    0.307  7.59e- 1
## 16 CCNE1 Normal      ID2_PresentPresent  -0.0161     0.163   -0.0985 9.22e- 1
# association

tdata2 <- tdata %>% 
  filter(ID2>0, RNAseq_Type=='Tumor') %>% 
  select(Tumor_Barcode,ID2,Gene,Exp) %>% 
  pivot_wider(names_from = Gene,values_from = Exp) %>% 
  mutate(ID2=log2(ID2))

library(ggcorrplot)

corr <- round(cor(tdata2[,-1]), 1)
p.mat <- cor_pmat(tdata2[,-1])

ggcorrplot(corr)

ggcorrplot(corr,method = "circle",insig = "blank",
           hc.order = TRUE,
           type = "lower",
           p.mat = p.mat,
           lab = TRUE)

#ggsave(filename = './output/ID2_Proliferation_Markers2.pdf',width = 5,height = 5,device = cairo_pdf())

# box plot
tdata %>% 
  ggplot(aes(ID2_Present,Exp))+
  geom_point(aes(fill=ID2_Present),pch=21,size=2.5,position = position_jitter(width = 0.2,height = 0))+
  geom_boxplot(width=0.7,outlier.shape = NA,fill=NA,size=1)+
  facet_grid(RNAseq_Type~Gene,scales ='free') +
  scale_fill_manual(values = id2color)+
  theme_ipsum_rc(axis_title_just = 'm',axis_title_size = 14,grid = 'Y')+
  guides(fill='none')+#fill = guide_legend(override.aes=list(shape=21)),)+
  labs(x='ID2 Mutational Signature',y='RNA-Seq gene expression log2(CPM)')+
  theme(axis.text.x = element_text(angle = 45,hjust = 1,vjust = 1),panel.spacing = unit(0.1,'cm'))+
  panel_border(color = 'black',linetype = 1)

#ggsave('./output/ID2_Proliferation_RNASeq.pdf',width = 10,height = 9,device = cairo_pdf)

tdata %>% 
  group_by(RNAseq_Type,Gene) %>% 
  do(tidy(wilcox.test(Exp~ID2_Present,data=.))) 
## # A tibble: 16 × 6
## # Groups:   RNAseq_Type, Gene [16]
##    RNAseq_Type Gene  statistic  p.value method                       alternative
##    <chr>       <chr>     <dbl>    <dbl> <chr>                        <chr>      
##  1 Normal      BUB1       4557 7.38e- 1 Wilcoxon rank sum test with… two.sided  
##  2 Normal      CCNB1      4821 8.03e- 1 Wilcoxon rank sum test with… two.sided  
##  3 Normal      CCNE1      4833 7.82e- 1 Wilcoxon rank sum test with… two.sided  
##  4 Normal      FOXM1      4977 5.51e- 1 Wilcoxon rank sum test with… two.sided  
##  5 Normal      MKI67      4691 9.71e- 1 Wilcoxon rank sum test with… two.sided  
##  6 Normal      MYBL2      4769 8.93e- 1 Wilcoxon rank sum test with… two.sided  
##  7 Normal      PLK1       4868 7.23e- 1 Wilcoxon rank sum test with… two.sided  
##  8 Normal      TOP2A      4872 7.17e- 1 Wilcoxon rank sum test with… two.sided  
##  9 Tumor       BUB1       5343 4.27e-13 Wilcoxon rank sum test with… two.sided  
## 10 Tumor       CCNB1      5250 1.87e-13 Wilcoxon rank sum test with… two.sided  
## 11 Tumor       CCNE1      6497 4.37e- 9 Wilcoxon rank sum test with… two.sided  
## 12 Tumor       FOXM1      4669 8.20e-16 Wilcoxon rank sum test with… two.sided  
## 13 Tumor       MKI67      5262 2.08e-13 Wilcoxon rank sum test with… two.sided  
## 14 Tumor       MYBL2      4765 2.08e-15 Wilcoxon rank sum test with… two.sided  
## 15 Tumor       PLK1       4885 6.52e-15 Wilcoxon rank sum test with… two.sided  
## 16 Tumor       TOP2A      5254 1.94e-13 Wilcoxon rank sum test with… two.sided
# scatter plot
tmp <- tdata %>% 
  filter(ID2>0,RNAseq_Type=='Tumor') %>% 
  group_by(Gene) %>% 
  do(tidy(cor.test(log2(.$ID2),.$Exp))) %>% 
  ungroup() %>% 
  arrange(p.value) %>% 
  mutate(label=paste0(Gene,'\nR=',round(estimate,2),'; P=',scientific(p.value))) %>% 
  select(Gene,label) %>% 
  mutate(label=fct_inorder(label))

tdata %>% 
  filter(ID2>0,RNAseq_Type=='Tumor') %>% 
  left_join(tmp) %>% 
  ggplot(aes(log2(ID2),Exp))+
  geom_point(pch=21,stroke=0.5,fill=id2color[2],size=2)+
  facet_wrap(~label,nrow = 2)+
  geom_smooth(method = 'lm')+
  theme_ipsum_rc(axis_title_just = 'm',axis_title_size = 14,grid = 'Y',strip_text_face = 'bold')+
  labs(x='Number of ID2 deletions (log2)',y='RNA-Seq expression log2(CPM)')+
  theme(panel.spacing = unit(0.2,'lines'),strip.text.x = element_text(face = 'plain',hjust = 0.5))+
  coord_cartesian(clip = 'off')+
  panel_border(color = 'black',linetype = 1,size=0.5)+
  guides(fill="none")

#ggsave('./output/ID2_Proliferation_RNASeq_tumor.pdf',width = 8,height = 5.5,device = cairo_pdf)


tmp <- tdata %>% 
  filter(ID2>0,RNAseq_Type=='Normal') %>% 
  group_by(Gene) %>% 
  do(tidy(cor.test(log2(.$ID2),.$Exp))) %>% 
  ungroup() %>% 
  arrange(p.value) %>% 
  mutate(label=paste0(Gene,'\nR=',round(estimate,2),'; P=',round(p.value,2))) %>% 
  select(Gene,label) %>% 
  mutate(label=fct_inorder(label))

tdata %>% 
  filter(ID2>0,RNAseq_Type=='Normal') %>% 
  left_join(tmp) %>% 
  ggplot(aes(log2(ID2),Exp))+
  geom_point(pch=21,stroke=0.5,fill=id2color[2],size=2)+
  facet_wrap(~label,nrow = 2)+
  geom_smooth(method = 'lm')+
  theme_ipsum_rc(axis_title_just = 'm',axis_title_size = 14,grid = 'Y',strip_text_face = 'bold')+
  labs(x='Number of ID2 deletions (log2)',y='RNA-Seq expression log2(CPM)')+
  theme(panel.spacing = unit(0.2,'lines'),strip.text.x = element_text(face = 'plain',hjust = 0.5))+
  coord_cartesian(clip = 'off')+
  panel_border(color = 'black',linetype = 1,size=0.5)+
  guides(fill="none")

#ggsave('./output/ID2_Proliferation_RNASeq_normal.pdf',width = 8,height = 5.5,device = cairo_pdf)

4 Fig. 3c: Association between mutational signature ID2 and survival

Fig. 3c: Kaplan-Meier survival curves for overall survival, stratified by the presence of mutational signature ID2. Significance p-values and Hazard Ratios (HRs) were calculated using two-sided Cox proportional-hazards regression, adjusting for age, sex, smoking, and tumor stage. Numbers in brackets indicate the number of patients.

# ID2 survival 
source('./functions/Survival_function.R')
suvdata <- suvdata %>% left_join(wgs_groups_info %>% select(Tumor_Barcode,SP_Group))
suvdata <- suvdata %>% 
  left_join(
    sherlock_data_full %>% filter(Gene=='TP53',Type=='Mutation_Driver') %>% select(Tumor_Barcode,TP53_Status=Alteration) %>% mutate(TP53_Status = factor(TP53_Status, levels=c('No','Yes')))
  ) 

suvdata_tmp <- id2data %>% 
  rename(Key = ID2_Present) %>% 
  left_join(suvdata) %>% 
  mutate(Key = factor(Key, levels = c('Absent','Present'))) %>% 
  filter(Tumor_Barcode %in% hq_samples2) 
#mutate(Survival_Month = if_else(Survival_Month > 60, NA, Survival_Month))

sinfo1 <- SurvZTWms(suvdata_input = suvdata_tmp,plot = TRUE,keyname='Mutational Signature ID2: ',gcolors = as.character(id2color),pvalsize = 4,width = 5,height = 5,filename = 'ID2-survival.pdf')
## # A tibble: 1 × 3
##   term    p.value lab                                               
##   <chr>     <dbl> <chr>                                             
## 1 Present 0.00208 "Present:\nP = 0.0021\nHR = 1.8 (95% CI: 1.2-2.6)"
## theme(axis.text.x=element_text(hjust=c(0, rep(0.5, 4), 1))) +
## theme(axis.text.y=element_text(vjust=c(0, rep(0.5, 4), 1)))
print(sinfo1)
## # A tibble: 1 × 3
##   term    p.value lab                                               
##   <chr>     <dbl> <chr>                                             
## 1 Present 0.00208 "Present:\nP = 0.0021\nHR = 1.8 (95% CI: 1.2-2.6)"
sinfo2 <- SurvZTWms_tp53(suvdata_input = suvdata_tmp,plot = TRUE,keyname='Mutational Signature ID2: ',gcolors = as.character(id2color),pvalsize = 4,width = 5,height = 4,filename = 'ID2-survival-tp53.pdf')
## theme(axis.text.x=element_text(hjust=c(0, rep(0.5, 4), 1))) +
## theme(axis.text.y=element_text(vjust=c(0, rep(0.5, 4), 1)))
print(sinfo2)
## # A tibble: 1 × 3
##   term    p.value lab                                            
##   <chr>     <dbl> <chr>                                          
## 1 Present  0.0328 "Present:\nP = 0.033\nHR = 1.5 (95% CI: 1-2.2)"
## check the output pdf file for the survival plots

5 Fig. 3d Enrichment of tumor metastasis in tumors with mutational signature ID2.

Odds ratios and p-values from the Fisher exact test are shown above the plot.

clinical_data %>% select(Tumor_Barcode,Metastasis_old)
## # A tibble: 1,217 × 2
##    Tumor_Barcode   Metastasis_old
##    <chr>           <chr>         
##  1 IGC-02-1001-T03 Yes           
##  2 IGC-02-1016-T01 Yes           
##  3 IGC-02-1017-T01 Yes           
##  4 IGC-02-1025-T01 Yes           
##  5 IGC-02-1029-T01 No            
##  6 IGC-02-1074-T01 No            
##  7 IGC-02-1097-T01 Yes           
##  8 IGC-02-1105-T01 Yes           
##  9 IGC-02-1169-T01 No            
## 10 IGC-02-1194-T01 Yes           
## # ℹ 1,207 more rows
tdata <- id2data %>% 
  left_join(clinical_data %>% select(Tumor_Barcode,Metastasis=Metastasis_after_diagnosis) %>% mutate(Metastasis = factor(Metastasis, levels = c('No','Yes')))
  ) %>% 
  filter(!is.na(Metastasis)) %>% 
  filter(Tumor_Barcode %in% hq_samples2) %>% 
  left_join(sp_group_data2) %>% 
  left_join(
    sherlock_data_full %>% filter(Gene=='TP53',Type=='Mutation_Driver') %>% select(Tumor_Barcode,TP53_Status=Alteration) %>% mutate(TP53_Status = factor(TP53_Status, levels=c('No','Yes'),labels=c('TP53 wildtype','TP53 mutant')))
  )

tdata %>% do(tidy(fisher.test(.$ID2_Present,.$Metastasis)))
## # A tibble: 1 × 6
##   estimate p.value conf.low conf.high method                         alternative
##      <dbl>   <dbl>    <dbl>     <dbl> <chr>                          <chr>      
## 1     2.45 0.00558     1.24      4.78 Fisher's Exact Test for Count… two.sided
tdata %>% group_by(TP53_Status) %>%  do(tidy(fisher.test(.$ID2_Present,.$Metastasis)))
## # A tibble: 2 × 7
## # Groups:   TP53_Status [2]
##   TP53_Status   estimate  p.value conf.low conf.high method          alternative
##   <fct>            <dbl>    <dbl>    <dbl>     <dbl> <chr>           <chr>      
## 1 TP53 wildtype     5.25 0.000824    1.81      15.7  Fisher's Exact… two.sided  
## 2 TP53 mutant       1.09 1           0.403      2.85 Fisher's Exact… two.sided
tdata %>% group_by(SP_Group_New) %>% do(tidy(fisher.test(.$ID2_Present,.$Metastasis)))
## # A tibble: 4 × 7
## # Groups:   SP_Group_New [4]
##   SP_Group_New estimate p.value conf.low conf.high method            alternative
##   <chr>           <dbl>   <dbl>    <dbl>     <dbl> <chr>             <chr>      
## 1 AS_N            0      1         0        190.   Fisher's Exact T… two.sided  
## 2 EU_N            2.76   0.0747    0.832      8.89 Fisher's Exact T… two.sided  
## 3 EU_S            0.702  0.585     0.210      2.37 Fisher's Exact T… two.sided  
## 4 Others          0.871  1         0.107      5.92 Fisher's Exact T… two.sided
tdata %>% filter(ID2>0) %>%  do(tidy(wilcox.test(ID2~Metastasis,data=.)))
## # A tibble: 1 × 4
##   statistic   p.value method                                         alternative
##       <dbl>     <dbl> <chr>                                          <chr>      
## 1      1815 0.0000274 Wilcoxon rank sum test with continuity correc… two.sided
my_comparisons = list(c('Yes','No'))
#myggstyle()
tdata %>% 
  filter(ID2>0) %>% 
  ggplot(aes(Metastasis,log2(ID2)))+
  geom_point(aes(fill=Metastasis),pch=21,size=3,position = position_jitter(width = 0.2,height = 0),stroke=0.2)+
  geom_boxplot(width=0.6,outlier.shape = NA,fill=NA,col=ncicolpal[1],size=0.6)+
  theme_ipsum_rc(axis_title_just = 'm',axis_title_size = 14)+
  theme(axis.text.x = element_text(angle = 45,hjust = 1,vjust = 1),panel.spacing = unit(0.2, "lines"),strip.background = element_rect(fill = 'gray95'),strip.text = element_text(hjust = 0.5),plot.margin = margin(4,4,4,4))+
  labs(x = "Tumor Metastasis", y = 'ID2 deletions (log2)')+
  scale_fill_manual(values = id2color)+
  guides(fill="none")+
  #facet_wrap(~TP53_Status,nrow = 1)+
  panel_border(color = 'black',linetype = 1)+
  stat_compare_means(comparisons = my_comparisons)

#ggsave(file='ID2_metastasis.pdf',width = 2,height = 6,device = cairo_pdf)

# overall
barplot_fisher(tdata,'ID2_Present','Metastasis',var1lab = 'Mutational signature ID2',var2lab = 'Tumor metastasis')
## [1] "Mutational signature ID2"
## # A tibble: 1 × 6
##   estimate p.value conf.low conf.high method                         alternative
##      <dbl>   <dbl>    <dbl>     <dbl> <chr>                          <chr>      
## 1     2.45 0.00558     1.24      4.78 Fisher's Exact Test for Count… two.sided

# TP53 wildtype
barplot_fisher(tdata %>% filter(TP53_Status=='TP53 wildtype'),'ID2_Present','Metastasis',var1lab = 'Mutational signature ID2',var2lab = 'Tumor metastasis')
## [1] "Mutational signature ID2"
## # A tibble: 1 × 6
##   estimate  p.value conf.low conf.high method                        alternative
##      <dbl>    <dbl>    <dbl>     <dbl> <chr>                         <chr>      
## 1     5.25 0.000824     1.81      15.7 Fisher's Exact Test for Coun… two.sided

# TP53 mutant
barplot_fisher(tdata %>% filter(TP53_Status=='TP53 mutant'),'ID2_Present','Metastasis',var1lab = 'Mutational signature ID2',var2lab = 'Tumor metastasis')
## [1] "Mutational signature ID2"
## # A tibble: 1 × 6
##   estimate p.value conf.low conf.high method                         alternative
##      <dbl>   <dbl>    <dbl>     <dbl> <chr>                          <chr>      
## 1     1.09       1    0.403      2.85 Fisher's Exact Test for Count… two.sided

tdata %>% 
  left_join(covdata0) %>%
  filter(Smoking=='Non-Smoker') %>% 
  do(tidy(glm(Metastasis ~ ID2_Present + TP53_Status + Tumor_Purity + Age + Gender, family='binomial',data=.)))
## # A tibble: 6 × 5
##   term                   estimate std.error statistic p.value
##   <chr>                     <dbl>     <dbl>     <dbl>   <dbl>
## 1 (Intercept)             -1.44      1.53      -0.947 0.344  
## 2 ID2_PresentPresent       1.26      0.474      2.67  0.00767
## 3 TP53_StatusTP53 mutant   0.569     0.399      1.43  0.153  
## 4 Tumor_Purity            -0.220     1.42      -0.155 0.877  
## 5 Age                     -0.0129    0.0188    -0.684 0.494  
## 6 GenderFemale             0.381     0.481      0.794 0.427

6 Fig. 3e Genomic Features Association with mutational signature ID

Enrichment of genomic alterations (WGD=whole genome doubling; SCNA=somatic copy number alterations) in tumors with mutational signature ID2, determined through logistic regression and adjusted for ancestry, sex, smoking status, age, and tumor purity. The horizontal lines represent significance thresholds (FDR<0.05 in orange and FDR<0.01 in red)

drglist <- readRDS('./data/drivers_intogene.RDS') %>% pull(symbol)
drglist <- unique(c(drglist,c('RET','ALK','AXL','NRG1','MET','FGFR2','ROS1','RB1','ERBB4','EGFR')))

drglist <- c(drglist,paste0(drglist,'-Amp'),paste0(drglist,'-Del'))

features=c('WGD_Status','Distal_Proximal','A3A_or_A3B','Kataegis','HRDetect_Status','Forte','All_L1')

tdata <- sherlock_data_full %>% 
  mutate(Type=if_else(Gene=='All_L1','Overall_Feature',Type)) %>% 
  filter(Tumor_Barcode %in% hq_samples2,Gene %in% c(features,drglist),!(Type %in% c('Gene_Alterations','Multiple_Mutations','Pathogenic_Germline'))) %>% 
  filter(Tumor_Barcode %in% hq_samples2) %>% 
  mutate(Alteration = if_else(Alteration == 'WGD','Yes',if_else(Alteration == 'nWGD','No',Alteration))) %>% 
  mutate(Alteration = if_else(Alteration == 'A3A','Yes',if_else(Alteration == 'A3B','No',Alteration))) %>% 
  left_join(id2data) %>% 
  mutate(Type=factor(Type,levels=c('Overall_Feature','Mutation_Driver','SCNA_Focal_Gene','Fusion')))

tresult <- tdata %>% 
  left_join(covdata0) %>% 
  group_by(Type,Gene) %>% 
  mutate(ID2_Present= as.integer(as.factor(ID2_Present))-1) %>% 
  do(tresult = safely(glm)(ID2_Present ~ Alteration + Assigned_Population + Gender + Age + Smoking + Tumor_Purity, family = "binomial", data=. )) %>% 
  mutate(tresult_null = map_lgl(tresult['result'], is.null)) %>% 
  filter(!tresult_null) %>% 
  mutate(fit = list(tidy(tresult[['result']]))) %>% 
  select(Type,Gene,fit) %>% 
  unnest(cols = c(fit)) %>% 
  ungroup() %>% 
  arrange(p.value) %>% 
  filter(term!='(Intercept)', term=='AlterationYes') %>% 
  filter(abs(estimate)<10) %>% 
  mutate(FDR=p.adjust(p.value))


tresult %>% 
  filter(!is.na(Type)) %>% 
  ggplot(aes((estimate),-log10(FDR)))+
  geom_hline(yintercept = -log10(0.01),linetype=2,col='red',size=0.5)+
  geom_hline(yintercept = -log10(0.05),linetype=2,col='orange',size=0.5)+
  geom_point(aes(fill=Type),pch=21,stroke=0.1,col='black',size=4)+
  scale_x_continuous(breaks = pretty_breaks())+
  scale_y_continuous(breaks = pretty_breaks())+
  ggrepel::geom_text_repel(data=tresult %>%filter(FDR<0.05), aes(label=Gene),force = 10,size=4)+
  scale_fill_manual(values = ncicolpal)+
  labs(x='Regression coefficient for tumor latency',y='-log10(FDR)',fill='Alterations or features')+
  guides(fill = guide_legend(override.aes = list(size=5)))+
  theme_ipsum_rc(base_size = 14,axis_title_just = 'm',axis_title_size = 16,grid='XY',ticks = T)+
  panel_border(color = 'black',size = 0.5)

#ggsave(filename = './output/ID2_hq_tumor_genomic_association.pdf',width = 7.5,height = 5,device = cairo_pdf)

7 Fig. 3f Assocaition between Hypoxia score and mutational signature ID2

Distribution of estimated hypoxia scores between tumors with ID2 signatures and those without. P-values from the Wilcoxon rank-sum test are displayed above the plot.

# ID2 and hypoxia 
library(ggdist)
tresult <- sherlock_variable %>% 
  left_join(id2data) %>% 
  group_by(name) %>% 
  do(tidy(wilcox.test(value~ID2_Present,data=.)))

tresult %>% 
  filter(str_detect(name,'hypoxia_score'))
## # A tibble: 7 × 5
## # Groups:   name [7]
##   name                                statistic  p.value method      alternative
##   <chr>                                   <dbl>    <dbl> <chr>       <chr>      
## 1 hypoxia_score_Combined_GeneSet         28956. 7.54e-12 Wilcoxon r… two.sided  
## 2 hypoxia_score_buffa                    23364  6.67e-20 Wilcoxon r… two.sided  
## 3 hypoxia_score_elvidge_hypoxia_all      40244  2.52e- 2 Wilcoxon r… two.sided  
## 4 hypoxia_score_elvidge_hypoxia_short    38464. 2.96e- 3 Wilcoxon r… two.sided  
## 5 hypoxia_score_ragnum                   20218. 1.92e-25 Wilcoxon r… two.sided  
## 6 hypoxia_score_sorenson                 31078  2.13e- 9 Wilcoxon r… two.sided  
## 7 hypoxia_score_west                     25795  3.73e-16 Wilcoxon r… two.sided
tdata <- sherlock_variable %>% 
  filter(str_detect(name,'hypoxia_score')) %>% 
  filter(name=='hypoxia_score_Combined_GeneSet') %>% 
  left_join(id2data) %>% 
  filter(Tumor_Barcode %in% hq_samples2) %>% 
  left_join(sp_group_data2) %>% 
  ungroup()

tdata <- tdata %>% mutate(SP_Group_New='ALL') %>% bind_rows(tdata) #%>% filter(SP_Group_New!='Others')

tdata %>% 
  ggplot(aes(SP_Group_New,value,fill=ID2_Present))+
  stat_eye(side = "left",position = position_dodge(width = 0.2),col='black')+
  scale_fill_manual(values = id2color)+
  scale_y_continuous(breaks = pretty_breaks())+
  theme_ipsum_rc(base_size = 18,axis_title_just = 'm',axis_title_size = 22)+
  labs(x = "", y = 'Hypoxia scorer',fill='ID2 mutational signature')+
  #guides(fill="none")+
  theme(legend.position = 'top')+
  panel_border(color = 'black',linetype = 1)

#ggsave(file='./output/ID2_hypoxia_score.pdf',width = 9,height = 9,device = cairo_pdf)

tdata %>% group_by(SP_Group_New) %>% do(tidy(wilcox.test(value~ID2_Present,data=.)))
## # A tibble: 5 × 5
## # Groups:   SP_Group_New [5]
##   SP_Group_New statistic     p.value method                          alternative
##   <chr>            <dbl>       <dbl> <chr>                           <chr>      
## 1 ALL              7070. 0.000000214 Wilcoxon rank sum test with co… two.sided  
## 2 AS_N              856. 0.0118      Wilcoxon rank sum test with co… two.sided  
## 3 EU_N              536  0.00622     Wilcoxon rank sum test with co… two.sided  
## 4 EU_S              408. 0.0259      Wilcoxon rank sum test with co… two.sided  
## 5 Others            113  0.306       Wilcoxon rank sum test with co… two.sided

8 Additional Supplementary Figures ——————————————————————

# ATR -  CHECK1  vs ID2 -------------------------------------------------------------------------
tdata <- rdata1 %>% filter(Gene %in% c('CHEK1','CHEK2'))

tdata <- id2data %>% 
  left_join(tdata) %>% 
  filter(!is.na(Exp),!is.na(ID2)) %>% 
  left_join(wgs_groups_info)

tdata %>% 
  filter(RNAseq_Type=='Tumor') %>% 
  ggplot(aes(ID2_Present,Exp))+
  geom_point(aes(fill=ID2_Present),pch=21,size=2.5,position = position_jitter(width = 0.15,height = 0))+
  geom_boxplot(width=0.7,outlier.shape = NA,fill=NA)+
  facet_grid(RNAseq_Type~Gene,scales = 'free') +
  theme_ipsum_rc(axis_title_just = 'm',axis_title_size = 14,grid = 'Y')+
  guides(fill = guide_legend(override.aes=list(shape=21)))+
  labs(x='ID2 mutational signature',y='RNA-Seq Expression log2(CPM)')+
  theme(axis.text.x = element_text(angle = 45,hjust = 1,vjust = 1),panel.spacing = unit(0.25, "cm"))+
  panel_border(color = 'black',linetype = 1)+
  scale_fill_manual(values = c("#01665e","#ff7f00"))+
  guides(fill="none")

#ggsave('./output/ID2_Checkpoint_RNASeq.pdf',width = 5,height = 8,device = cairo_pdf)

tdata %>% 
  group_by(RNAseq_Type,Gene) %>% 
  do(tidy(wilcox.test(Exp~ID2_Present,data=.))) %>% 
  arrange(p.value)
## # A tibble: 4 × 6
## # Groups:   RNAseq_Type, Gene [4]
##   RNAseq_Type Gene  statistic  p.value method                        alternative
##   <chr>       <chr>     <dbl>    <dbl> <chr>                         <chr>      
## 1 Tumor       CHEK1     20147 1.56e-25 Wilcoxon rank sum test with … two.sided  
## 2 Tumor       CHEK2     23793 3.40e-19 Wilcoxon rank sum test with … two.sided  
## 3 Normal      CHEK1     23218 1.51e- 1 Wilcoxon rank sum test with … two.sided  
## 4 Normal      CHEK2     22859 2.38e- 1 Wilcoxon rank sum test with … two.sided
tdata %>% 
  filter(ID2>0) %>% 
  filter(RNAseq_Type=='Tumor') %>% 
  ggplot(aes(log2(ID2),Exp))+
  geom_point(aes(fill=RNAseq_Type),pch=21,stroke=0.2,size=2.5)+
  geom_smooth(method = 'lm')+
  facet_wrap(~Gene,scales = 'free',nrow = 2) +
  theme_ipsum_rc(axis_title_just = 'm',axis_title_size = 14,grid = 'Y',strip_text_face = 'bold')+
  labs(y='RNA-Seq Expression log2(CPM)',x='ID2 mutations (log2)')+
  theme(panel.spacing = unit(0.5,'lines'))+
  scale_fill_manual(values = c("#ff7f00"))+
  panel_border(color = 'black',linetype = 1)+
  guides(fill="none")

#ggsave('./output/ID2_Checkpoint_RNASeq2.pdf',width = 4.5,height = 8,device = cairo_pdf)

tdata %>% 
  group_by(RNAseq_Type,Gene) %>% 
  filter(ID2>0) %>% 
  do(tidy(cor.test(.$Exp,log2(.$ID2),data=.))) %>% 
  arrange(p.value)
## # A tibble: 4 × 10
## # Groups:   RNAseq_Type, Gene [4]
##   RNAseq_Type Gene  estimate statistic  p.value parameter conf.low conf.high
##   <chr>       <chr>    <dbl>     <dbl>    <dbl>     <int>    <dbl>     <dbl>
## 1 Tumor       CHEK1    0.510     12.5  1.07e-30       440    0.438    0.576 
## 2 Tumor       CHEK2    0.458     10.8  2.47e-24       440    0.381    0.529 
## 3 Normal      CHEK2   -0.164     -2.97 3.21e- 3       321   -0.268   -0.0554
## 4 Normal      CHEK1   -0.140     -2.54 1.17e- 2       321   -0.246   -0.0316
## # ℹ 2 more variables: method <chr>, alternative <chr>
# tdata %>% 
#   mutate(Exp=2^Exp) %>% 
#   group_by(RNAseq_Type,ID2) %>% 
#   summarise(mvalue=median(Exp))
# 

# Figure xxx: ID2 association with SV count and signature -----------------------------
my_comparisons <- list(c("Absent",'Present'))

tdata <- sherlock_variable %>% 
  filter(str_detect(name,'SV')) %>% 
  left_join(id2data) %>% 
  filter(Tumor_Barcode %in% hq_samples2)

tdata %>% group_by(name) %>% do(tidy(wilcox.test(log2(value+1) ~ ID2_Present, data=.)))
## # A tibble: 3 × 5
## # Groups:   name [3]
##   name       statistic  p.value method                               alternative
##   <chr>          <dbl>    <dbl> <chr>                                <chr>      
## 1 SV            15366. 3.57e- 9 Wilcoxon rank sum test with continu… two.sided  
## 2 SV_Complex    23420. 6.54e- 1 Wilcoxon rank sum test with continu… two.sided  
## 3 SV_Simple      9290. 1.26e-23 Wilcoxon rank sum test with continu… two.sided
tdata %>% 
  ggplot(aes(ID2_Present,log2(value+1),fill=ID2_Present))+
  geom_boxplot(width=0.6,outlier.shape = NA,alpha =0.25)+
  geom_point(pch=21,size=2.5,position = position_jitter(width = 0.15,height = 0),color="white",stroke=0.05)+
  scale_fill_manual(values = id2color)+
  facet_wrap(~name)+
  scale_y_continuous(breaks = pretty_breaks(n = 7))+
  theme_ipsum_rc(axis_title_just = 'm',axis_title_size = 14)+
  theme(plot.margin = margin(4,4,4,4),panel.spacing = unit(0.1,'cm'),strip.text.x = element_text(hjust = 0.5,face = 4))+
  labs(x = "Mutational signature ID2", y = 'log2(SV count + 1)')+
  guides(fill="none")+
  panel_border(color = 'black',linetype = 1)+
  stat_compare_means(comparisons = my_comparisons)

#ggsave(filename = 'SV_ID2_present_vs_absent.pdf',width = 5,height = 6,device = cairo_pdf)

# SV profiles

tdata <- sherlock_sv32_profile %>% 
  separate(col = MutationType,into = c('a','b','c'),sep = '_') %>% group_by(Tumor_Barcode,a,b) %>% summarise(Contribution = sum(Contribution)) %>% ungroup() %>% 
  left_join(id2data) %>% 
  filter(Tumor_Barcode %in% hq_samples2)

tdata %>% group_by(a,b) %>% do(tidy(wilcox.test(log2(Contribution+1) ~ ID2_Present, data=.))) %>% ungroup()%>% arrange(p.value) %>% mutate(fdr=p.adjust(p.value)) %>% View()

tdata %>% 
  ggplot(aes(ID2_Present,log2(Contribution+1),fill=ID2_Present))+
  geom_boxplot(width=0.6,outlier.shape = NA,alpha =0.25)+
  geom_point(pch=21,size=2,position = position_jitter(width = 0.15,height = 0),color="white",stroke=0.05)+
  scale_fill_manual(values = id2color)+
  facet_grid(a~b)+
  scale_y_continuous(breaks = pretty_breaks(n = 7))+
  theme_ipsum_rc(axis_title_just = 'm',axis_title_size = 14)+
  theme(plot.margin = margin(4,4,4,4),panel.spacing = unit(0.1,'cm'),strip.text.x = element_text(hjust = 0.5,face = 4))+
  labs(x = "Mutational signature ID2", y = 'log2(SV count + 1)')+
  guides(fill="none")+
  panel_border(color = 'black',linetype = 1)+
  stat_compare_means(comparisons = my_comparisons)

#ggsave(filename = './output/SV_subtypes_ID2_present_vs_absent.pdf',width = 6,height = 8,device = cairo_pdf)

my_comparisons <- list(c("Absent",'Present'))

tdata <- sherlock_variable %>% 
  filter(str_detect(name,'SV')) %>% 
  left_join(id2data) %>% 
  filter(Tumor_Barcode %in% hq_samples2) %>% 
  left_join(covdata0 %>% select(Tumor_Barcode, Smoking)) %>% 
  filter(Smoking!='Unknown') %>% 
  filter(name=='SV')

tdata %>% group_by(Smoking,name) %>% do(tidy(wilcox.test(log2(value+1) ~ ID2_Present, data=.)))
## # A tibble: 2 × 6
## # Groups:   Smoking, name [2]
##   Smoking    name  statistic     p.value method                      alternative
##   <fct>      <chr>     <dbl>       <dbl> <chr>                       <chr>      
## 1 Smoker     SV         1132 0.000000329 Wilcoxon rank sum test wit… two.sided  
## 2 Non-Smoker SV         8369 0.0393      Wilcoxon rank sum test wit… two.sided
tdata %>% 
  ggplot(aes(ID2_Present,log2(value+1),fill=ID2_Present))+
  geom_boxplot(width=0.6,outlier.shape = NA,alpha =0.25)+
  geom_point(pch=21,size=2.5,position = position_jitter(width = 0.15,height = 0),color="white",stroke=0.05)+
  scale_fill_manual(values = id2color)+
  facet_wrap(~Smoking)+
  scale_y_continuous(breaks = pretty_breaks(n = 7))+
  theme_ipsum_rc(axis_title_just = 'm',axis_title_size = 14)+
  theme(plot.margin = margin(4,4,4,4),panel.spacing = unit(0.1,'cm'),strip.text.x = element_text(hjust = 0.5,face = 4))+
  labs(x = "Mutational signature ID2", y = 'log2(SV count + 1)')+
  guides(fill="none")+
  panel_border(color = 'black',linetype = 1)+
  stat_compare_means(comparisons = my_comparisons)

#ggsave(filename = './output/SV_ID2_present_vs_absent_Smoking.pdf',width = 4,height = 6,device = cairo_pdf)

# # Figure xxx: ID2 association with PGA and CNV signature ----------------
my_comparisons <- list(c("Absent",'Present'))
load('./data/sherlock_PGA.RData',verbose = T)
## Loading objects:
##   sherlock_PGA
tdata <- sherlock_PGA %>% 
  left_join(id2data) %>% 
  filter(Tumor_Barcode %in% hq_samples2)

tdata %>% do(tidy(wilcox.test(PGA_WGD ~ ID2_Present, data=.)))
## # A tibble: 1 × 4
##   statistic       p.value method                                     alternative
##       <dbl>         <dbl> <chr>                                      <chr>      
## 1     15293 0.00000000265 Wilcoxon rank sum test with continuity co… two.sided
tdata %>% 
  ggplot(aes(ID2_Present,PGA_WGD,fill=ID2_Present))+
  geom_boxplot(width=0.6,outlier.shape = NA,alpha =0.25)+
  geom_point(pch=21,size=2.5,position = position_jitter(width = 0.15,height = 0),color="white",stroke=0.05)+
  scale_fill_manual(values = id2color)+
  scale_y_continuous(breaks = pretty_breaks(n = 7),labels = percent_format())+
  theme_ipsum_rc(axis_title_just = 'm',axis_title_size = 14)+
  theme(plot.margin = margin(4,4,4,4),panel.spacing = unit(0.1,'cm'),strip.text.x = element_text(hjust = 0.5,face = 4))+
  labs(x = "Mutational signature ID2", y = 'Percent genome altered by copy number')+
  guides(fill="none")+
  panel_border(color = 'black',linetype = 1)+
  stat_compare_means(comparisons = my_comparisons)

#ggsave(filename = './output/PGA_ID2_present_vs_absent.pdf',width = 2.7,height = 6,device = cairo_pdf)
my_comparisons <- list(c("Absent",'Present'))

tdata <- sherlock_PGA %>% 
  left_join(id2data) %>% 
  filter(Tumor_Barcode %in% hq_samples2) %>% 
  left_join(covdata0 %>% select(Tumor_Barcode, Smoking)) %>% 
  filter(Smoking!='Unknown')

tdata %>% do(tidy(wilcox.test(PGA_WGD ~ ID2_Present, data=.)))
## # A tibble: 1 × 4
##   statistic       p.value method                                     alternative
##       <dbl>         <dbl> <chr>                                      <chr>      
## 1     15287 0.00000000302 Wilcoxon rank sum test with continuity co… two.sided
tdata %>% 
  ggplot(aes(ID2_Present,PGA_WGD,fill=ID2_Present))+
  geom_boxplot(width=0.6,outlier.shape = NA,alpha =0.25)+
  geom_point(pch=21,size=2.5,position = position_jitter(width = 0.15,height = 0),color="white",stroke=0.05)+
  facet_wrap(~Smoking)+
  scale_fill_manual(values = id2color)+
  scale_y_continuous(breaks = pretty_breaks(n = 7),labels = percent_format())+
  theme_ipsum_rc(axis_title_just = 'm',axis_title_size = 14)+
  theme(plot.margin = margin(4,4,4,4),panel.spacing = unit(0.1,'cm'),strip.text.x = element_text(hjust = 0.5,face = 4))+
  labs(x = "Mutational signature ID2", y = 'Percent genome altered by copy number')+
  guides(fill="none")+
  panel_border(color = 'black',linetype = 1)+
  stat_compare_means(comparisons = my_comparisons)

#ggsave(filename = './output/PGA_ID2_present_vs_absent_smoking.pdf',width = 4,height = 6,device = cairo_pdf)

# Signature
# Figure 2c-d ---------------------------------------------------------------
nmutation_input <-  0
conflicts_prefer(dplyr::lag)
source('./functions/Sigvisualfunc.R')
load('./data/Sigvisualfunc.RData')

sigcol <- ncicolpal[1:length(colnames(cn68_activity)[-1])]
names(sigcol) <- colnames(cn68_activity)[-1]
nmutation_input <- 0
tmp <- id2data %>% filter(Tumor_Barcode %in% hq_samples2,ID2_Present == "Present")%>% pull(Tumor_Barcode)
sigdata <- cn68_activity %>% rename(Samples=Tumor_Barcode) %>% filter(Samples %in% tmp)
data1 <- prevalence_plot(sigdata = sigdata,nmutation = nmutation_input,output_plot = paste0('./output/ID2_present_prevalence.pdf'),colset = sigcol,rel_widths = c(1,4))

tmp <- id2data %>% filter(Tumor_Barcode %in% hq_samples2,ID2_Present == "Absent")%>% pull(Tumor_Barcode)
sigdata <- cn68_activity %>% rename(Samples=Tumor_Barcode) %>% filter(Samples %in% tmp)
data2 <- prevalence_plot(sigdata = sigdata,nmutation = nmutation_input,output_plot = paste0('./output/ID2_absent_prevalence.pdf'),colset = sigcol,rel_widths = c(1,4))

tdata <- bind_rows(
  data1$freq_data %>% mutate(Group='Present'),
  data2$freq_data %>% mutate(Group='Absent')
)

#myggstyle()

plotdata <- tdata %>% 
  filter(Type == 'Prevalence by samples') %>% 
  arrange(desc(Value)) %>% 
  mutate(Signature = fct_inorder(Signature)) %>% 
  mutate(Value2=if_else(Value<0.1,0.1,Value)) %>% 
  mutate(Value = if_else(Group=='Absent',-1*Value,Value)) %>% 
  mutate(Value2 = if_else(Group=='Absent',-1*Value2,Value2)) 

p1 <- plotdata %>% 
  ggplot(aes(Value, y=Signature,fill=Signature))+
  geom_col(width = 0.75,col='gray10',linewidth=0.3)+
  geom_vline(xintercept = 0,col='gray10')+
  geom_text(data = plotdata %>% filter(Group=='Present'),aes(label=Percentage),vjust=0.5,hjust=2)+
  geom_text(data = plotdata %>% filter(Group=='Absent'),aes(label=Percentage),vjust=0.5,hjust=-1)+
  scale_fill_manual(values = sigcol)+
  scale_x_continuous(limits = c(-1,1),breaks = c(-1,-0.75,-0.5,-0.25,0,0.25,0.5,0.75,1),labels = c("100%","75%","50%","25%","0%","25%","50%","75%","100%"))+
  labs(x='Prevalence by samples (%)',y=NULL,fill='Group')+
  theme_ipsum_rc(base_size = 12,axis_title_just = 'm',axis_title_size = 12,grid = 'Y',ticks = T,plot_margin=margin(5.5,5.5,5.5,5.5))+
  panel_border(color = 'gray10',size = 1)+
  theme(legend.position = 'none',panel.grid.major.y = element_line(linetype = 2,colour = 'gray90'))

p1

testdata <- cn68_activity %>%
  pivot_longer(-Tumor_Barcode) %>% 
  mutate(value=if_else(value>0,'Present','Absent')) %>% 
  mutate(value=factor(value,levels=c('Absent','Present'))) %>% 
  left_join(id2data) %>% 
  filter(Tumor_Barcode %in% hq_samples2,ID2_Present %in% c('Absent','Present')) %>% 
  mutate(ID2_Present = factor(ID2_Present, levels=c('Absent','Present'))) %>% 
  group_by(name) %>% 
  mutate(value=as.factor(value)) %>% 
  do(tresult = safely(stats::fisher.test)(.$value,.$ID2_Present)) %>% 
  mutate(tresult_null = map_lgl(tresult['result'], is.null)) %>% 
  filter(!tresult_null) %>% 
  mutate(fit = list(tidy(tresult[['result']]))) %>% 
  select(name,fit) %>% 
  unnest(cols = c(fit)) %>% 
  ungroup() %>% 
  arrange(p.value) %>% 
  mutate(FDR=p.adjust(p.value,method='BH'))

p2 <- testdata %>% 
  ggplot(aes(log2(estimate),-log10(FDR),fill=name))+
  geom_hline(yintercept = -log10(0.05),col=ncicolpal[2],linetype=2)+
  geom_hline(yintercept = -log10(0.01),col=ncicolpal[1],linetype=2)+
  geom_vline(xintercept = 0,col="gray10",linetype=1,linewidth=0.5)+
  geom_point(pch=21,size=3.5,col='black',stroke=0.2)+
  ggrepel::geom_text_repel(data=testdata %>% filter(FDR<0.05),aes(label=name))+
  scale_fill_manual(values = sigcol)+
  scale_x_continuous(breaks = pretty_breaks(n = 7),limits = c(-4.5,4.5),position = "top")+
  scale_y_continuous(breaks = pretty_breaks(n=7))+
  theme_ipsum_rc(base_size = 12,axis_title_just = 'm',axis_title_size = 12,grid = F,ticks = T,plot_margin=margin(5.5,5.5,5.5,5.5))+
  labs(x = 'Odd ratio (log2)', y = '-log10(FDR)\n')+
  guides(fill="none")+
  panel_border(color = 'black',linetype = 1,size=0.5)+
  coord_cartesian(clip = 'off')

p2

plot_grid(p2,p1,align = 'v',axis = 'lr',ncol = 1,rel_heights = c(2,2))

#ggsave(filename = './output/CNV_signature_ID2_present_vs_absent.pdf',width = 6,height = 5.5,device = cairo_pdf)

9 R session information

sessionInfo()
## R version 4.3.2 (2023-10-31)
## Platform: aarch64-apple-darwin20 (64-bit)
## Running under: macOS 15.5
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/4.3-arm64/Resources/lib/libRblas.0.dylib 
## LAPACK: /Library/Frameworks/R.framework/Versions/4.3-arm64/Resources/lib/libRlapack.dylib;  LAPACK version 3.11.0
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## time zone: America/New_York
## tzcode source: internal
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] janitor_2.2.1      ggforce_0.5.0      ggtext_0.1.2       ggdist_3.3.3      
##  [5] survival_3.8-3     survminer_0.5.0    ggpubr_0.6.1       ggcorrplot_0.1.4.1
##  [9] showtext_0.9-7     showtextdb_3.0     sysfonts_0.8.9     conflicted_1.2.0  
## [13] ggsci_3.2.0        broom_1.0.8        ggpmisc_0.6.2      ggpp_0.5.9        
## [17] ggasym_0.1.6       data.table_1.17.8  ggnewscale_0.5.2   ggrepel_0.9.6     
## [21] cowplot_1.2.0      hrbrthemes_0.8.7   scales_1.4.0       ggsankey_0.0.99999
## [25] lubridate_1.9.4    forcats_1.0.0      stringr_1.5.1      dplyr_1.1.4       
## [29] purrr_1.0.4        readr_2.1.5        tidyr_1.3.1        tibble_3.3.0      
## [33] ggplot2_3.5.2      tidyverse_2.0.0   
## 
## loaded via a namespace (and not attached):
##  [1] polynom_1.4-1           gridExtra_2.3           rlang_1.1.6            
##  [4] magrittr_2.0.3          snakecase_0.11.1        compiler_4.3.2         
##  [7] mgcv_1.9-1              systemfonts_1.2.3       vctrs_0.6.5            
## [10] reshape2_1.4.4          quantreg_6.1            pkgconfig_2.0.3        
## [13] fastmap_1.2.0           backports_1.5.0         labeling_0.4.3         
## [16] KMsurv_0.1-6            utf8_1.2.6              rmarkdown_2.29         
## [19] tzdb_0.5.0              ragg_1.4.0              MatrixModels_0.5-4     
## [22] xfun_0.52               cachem_1.1.0            jsonlite_2.0.0         
## [25] tweenr_2.0.3            R6_2.6.1                bslib_0.9.0            
## [28] stringi_1.8.7           RColorBrewer_1.1-3      car_3.1-3              
## [31] extrafontdb_1.0         jquerylib_0.1.4         Rcpp_1.1.0             
## [34] knitr_1.50              zoo_1.8-14              extrafont_0.19         
## [37] Matrix_1.6-5            splines_4.3.2           timechange_0.3.0       
## [40] tidyselect_1.2.1        rstudioapi_0.17.1       dichromat_2.0-0.1      
## [43] abind_1.4-8             yaml_2.3.10             lattice_0.22-7         
## [46] plyr_1.8.9              withr_3.0.2             evaluate_1.0.4         
## [49] polyclip_1.10-7         xml2_1.3.8              survMisc_0.5.6         
## [52] pillar_1.11.0           carData_3.0-5           distributional_0.5.0   
## [55] generics_0.1.4          hms_1.1.3               xtable_1.8-4           
## [58] glue_1.8.0              gdtools_0.4.2           tools_4.3.2            
## [61] SparseM_1.84-2          ggsignif_0.6.4          grid_4.3.2             
## [64] Rttf2pt1_1.3.12         nlme_3.1-168            Formula_1.2-5          
## [67] cli_3.6.5               km.ci_0.5-6             textshaping_1.0.1      
## [70] fontBitstreamVera_0.1.1 gtable_0.3.6            rstatix_0.7.2          
## [73] sass_0.4.10             digest_0.6.37           fontquiver_0.2.1       
## [76] farver_2.1.2            memoise_2.0.1           htmltools_0.5.8.1      
## [79] lifecycle_1.0.4         fontLiberation_0.1.0    gridtext_0.1.5         
## [82] MASS_7.3-60.0.1